# -*- coding: UTF-8 -*-
import os
import sys

import requests
from bs4 import BeautifulSoup
import django
import json
import re
import time
import math
import jieba.analyse
import random

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "simple_cms.settings")
django.setup()
from crawler.models import *

SEARCH_URL = 'https://m.sm.cn/s?q={}&by=relative'
USER_AGENT = 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
from newspaper import Article

from simple_cms.backends import Download
from cms import models


def go():
    """Entry point: crawl related search results for every enabled keyword."""
    for keyword in Keyword.objects.filter(enable=True).all():
        print(keyword)
        get_relative(keyword.word)


def get_relative(word):
    """Fetch the 'related searches' for *word* and crawl each unseen term.

    Every term successfully crawled is recorded in History so it is
    skipped on subsequent runs.
    """
    response = requests.get(
        SEARCH_URL.format(word),
        headers={'User-Agent': USER_AGENT},
        timeout=(5, 10),
    )
    if response.status_code != 200:
        print('response code error:{}'.format(response.status_code))
        return
    page = BeautifulSoup(response.text, "lxml")
    relative = page.find(class_='relative-keywords')
    if not relative:
        raise RuntimeError('相关搜索词没有找到')
    relative_words = json.loads(relative.attrs['rec_content'])
    print(relative_words)

    # Crawl only terms that were never processed before.
    for candidate in relative_words:
        if History.objects.filter(word=candidate).exists():
            continue
        get_links(candidate)
        # Record the term right after crawling so it is de-duplicated next time.
        History.objects.create(word=candidate)


def get_tags(content):
    """Return the top-5 keywords jieba extracts from *content* as a list."""
    keywords = jieba.analyse.extract_tags(content, topK=5)
    return list(keywords)


def get_links(word):
    """Search *word*, scrape the result articles and save one merged article.

    Collects up to three article images, de-duplicates and shuffles the
    scraped paragraphs, assembles them into HTML (images at top, middle and
    bottom) and persists everything through save_article().
    """
    print('开始抓取关键词的链接：{}'.format(word))
    begin = time.time()

    r = requests.get(SEARCH_URL.format(word), headers={
        'User-Agent': USER_AGENT
    }, timeout=(60, 60))
    if r.status_code != 200:
        print('response code error:{}'.format(r.status_code))
        return
    soup = BeautifulSoup(r.text, 'lxml')
    links = soup.find_all(class_='c-header-inner')
    hrefs = []
    for link in links:
        if 'href' in link.attrs:
            href = link.attrs.get('href')
            # Keep only absolute external links; skip the search engine itself
            # and big portals that are not worth scraping.
            if (href.startswith('http') and 'sm.cn' not in href
                    and 'baidu.com' not in href and 'taobao.com' not in href):
                hrefs.append(href)
    print(hrefs)
    if not hrefs:
        return

    # Each article contributes an equal share of paragraphs: a single article
    # contributes 100%, two articles 50% each, and so on.
    rate = 1 / len(hrefs)
    print('每篇文章摘取比例:{}'.format(rate))

    results = []
    cover_image = None
    images = []

    for url in hrefs:
        try:
            text, top_image = get_article(url)
            if len(text) < 100:
                print('内容太短，中断执行：{}'.format(text))
                continue
        except Exception:
            print('出错跳过一个链接')
            continue
        if top_image and top_image.find('http') != -1 and len(images) < 3:
            img = top_image.split('?')[0]
            ext = os.path.splitext(img)[1]
            # Skip favicon-style or extension-less image URLs.
            if ext != '.ico' and ext != '':
                images.append(img)
        array = text.split('\n')

        # De-duplicate paragraphs; drop date-only lines and very short ones.
        temp = []
        for line in array:
            if line in temp:
                continue
            if len(clear(line)) == 0:  # line is only a date and/or whitespace
                continue
            if len(line) < 10:
                continue
            temp.append(line)

        size = int(rate * len(array))
        if size >= len(temp):
            results.extend(temp)
        else:
            # BUG FIX: the original copied every paragraph here, ignoring the
            # computed share; take only the first `size` paragraphs.
            results.extend(temp[:size])
    if len(results) == 0:
        print('{}=没有正文，中断执行'.format(word))
        return

    # Download up to three images locally; the first success is the cover.
    for index, url in enumerate(images):
        try:
            print('开始下载图片：{}'.format(url))
            r = requests.get(url, timeout=(5, 10))
            # BUG FIX: require HTTP 200 — any non-zero status (404, 500, ...)
            # was previously treated as success.
            if r.status_code == 200 and r.content:
                images[index] = Download().save(os.path.split(url)[1], r.content)
                if not cover_image:
                    cover_image = images[index]
        except Exception as e:
            print('下载图片报错：{}'.format(e))
    print('图片下载完成')

    # Assemble HTML: first image on top, second in the middle, third at the end.
    html = []
    if len(images) > 0:
        html.append('<img src="{}">'.format(images[0]))
    center = len(images) >= 2

    tags = get_tags('\n'.join(results))

    # Shuffle paragraphs/clauses before building the final article.
    results = random_content(results)
    title = get_title(word, results[0])
    print('标题：{}'.format(title))
    print('标签：{}'.format(tags))
    for index, line in enumerate(results):
        html.append('<p>{}</p>'.format(line))
        if center and math.ceil(len(results) / 2) == index:
            html.append('<img src="{}">'.format(images[1]))

    if len(images) >= 3:
        html.append('<img src="{}">'.format(images[2]))

    end = time.time()
    print('图片：{}'.format(images))
    # BUG FIX: milliseconds are seconds * 1000; the original multiplied by 100.
    print('关键词：{}\n执行耗时：{}ms\n'.format(word, int((end - begin) * 1000)))

    summary = ''.join(results)[0:120]
    save_article(title, summary, tags, cover_image, '\n'.join(html))


def get_article(url):
    """Download and parse *url* with newspaper; return (text, top_image_url).

    The article body is normalized so that every whitespace-separated chunk
    of text sits on its own line.
    """
    print('抓取链接：{}'.format(url))
    news = Article(url, language='zh')
    news.download()
    news.parse()

    # BUG FIX: use a raw string for the regex — '\s' in a plain string is an
    # invalid escape sequence (DeprecationWarning, SyntaxError in the future).
    chunks = re.findall(r'[^\s]+', news.text, re.S)
    text = '\n'.join(chunks)
    return text, news.top_image


def get_title(word, text):
    """Build an article title from *word* and the first clause of *text*.

    The clause ends at the first occurrence of ',', '.', '，' or '。' —
    checked in that priority order (not by position in the text). If none is
    found the first 30 characters are used. When *word* already appears in
    the clause it is used alone; otherwise it is prefixed. The result is
    always capped at 30 characters.
    """
    index = -1
    for separator in (',', '.', '，', '。'):
        index = text.find(separator)
        if index != -1:
            break

    title = text[0:30] if index == -1 else text[0:index]
    if word in title:
        return title[0:30]
    return '{} {}'.format(word, title)[0:30]


# Resolve the fallback category ("other") once at import time; crawled
# articles are saved under it. Remains None when the category row is missing.
category_id = None
c = models.Category.objects.filter(alias='other').first()
if c:
    category_id = c.id

# Load all sensitive-word rows once; save_article() applies them as
# word -> replacement substitutions on title/summary/content.
sensitives = list(Sensitive.objects.all())


def save_article(title, summary, tags, cover, content):
    """Persist a crawled article (unpublished) under the fallback category.

    Sensitive words are replaced in the title, summary and body before the
    Article row is created; *tags* are resolved to Tag rows via get_tag().
    """
    print('保存文章,标题={}'.format(title))

    # Scrub configured sensitive words from every user-visible field.
    for sensitive in sensitives:
        title = title.replace(sensitive.word, sensitive.replace)
        summary = summary.replace(sensitive.word, sensitive.replace)
        content = content.replace(sensitive.word, sensitive.replace)

    tag_dbs = [get_tag(name) for name in tags]
    try:
        article = models.Article.objects.create(
            title=title,
            summary=summary,
            cover=cover,
            content=content,
            published=False,
            category_id=category_id
        )
        article.tags.add(*tag_dbs)
    except Exception as e:
        print('保存文章出错')
        print(e)


def random_content(content):
    """Shuffle clauses and sentences, then regroup into 2-4 sentence chunks.

    *content* is either a list of sentences or a string that is split on
    '。'. Within each sentence the '，'-separated clauses are shuffled, then
    the sentences themselves are shuffled and merged back into paragraphs of
    2-4 sentences joined with '。'.

    Returns a list of paragraph strings. The input list is NOT modified.
    """
    # BUG FIX: use isinstance instead of `type(x) == list`, and copy the
    # list — the original shuffled the caller's list in place.
    if isinstance(content, list):
        arrays = list(content)
    else:
        arrays = content.split('。')

    # Shuffle the comma-separated clauses inside every sentence.
    for i, line in enumerate(arrays):
        clauses = line.split("，")
        arrays[i] = '，'.join(random.sample(clauses, len(clauses)))

    # Shuffle the sentences themselves.
    arrays = random.sample(arrays, len(arrays))

    results = []
    index = 0
    # Merge 2-4 sentences into each output paragraph.
    while index < len(arrays):
        step = random.randint(2, 4)
        end = min(index + step, len(arrays))
        results.append("。".join(arrays[index:end]))
        index += step

    return results


def get_tag(name):
    """Return the Tag row for *name*, creating it if needed, and bump its
    usage counter.

    Uses get_or_create instead of the original filter-then-create pair,
    which could create duplicate tags under concurrent runs.
    """
    tag, _created = models.Tag.objects.get_or_create(name=name)
    # Track how often the tag has been referenced by generated articles.
    tag.use += 1
    tag.save()
    return tag


def clear(line):
    """Strip date-like tokens (e.g. '2020-01-02') and all whitespace from *line*."""
    date_or_space = re.compile(r'(\d{2,4}\D\d{1,2}\D\d{1,2}\D{0,1})|\s+')
    return date_or_space.sub('', line)


if __name__ == '__main__':
    # Crawl every enabled keyword; the commented-out lines below are ad-hoc
    # manual test invocations kept around for debugging individual steps.
    go()
    # r = random_content(
    #     '金青鸟是一种外形很漂亮的宠物鸟，价格会因季节而有不同，在春季一只金青鸟价格在250至300元左右，冬季时金青鸟的价格会更高点。金青鸟价格1、开春后大量出小鸟时，在市面上一般250-300元能找到很不错的雏鸟。12月份多是换毛后的出的小鸟，数量比较少，价钱正常是三百或以上。2、好的母种鸟300—500元，公的要500—1000元。买种鸟**是亲自去别人的繁殖场选为好，选的母鸟**是有抱窝经验的，母鸟主要看它的抱窝型、品相也重要。公鸟要选会四喜口，而且各项条件都满意的公鸟。金青鸟的特点1、漂亮的外形金青鸟外形很漂亮，尤其是雄鸟，额头上两道粗粗的黄眉在额头中心相连，俗称通眉；另有一道黄眉生在眼下，像是腮红点缀着脸庞；背上的羽毛像鸭背一层绿一层黄；喉部至胸腹部是鲜艳的金黄色羽毛；尾羽的尾部又是白颜色的，非常好看。小鸟与母鸟的羽毛颜色较暗淡。2、委婉动听的歌喉这种鸟雌雄都可鸣叫，但雄鸟优秀的歌喉使得人听人爱。一般雄鸟都会唱七、八个调，而且可同时发出几种音阶声，令人以为有很多鸟在唱歌，让人越听越舒服。3、个体小、主食简单金青鸟体长12')
    # print('random：')
    # print("\n".join(r))
    # r = get_title('金青鸟',
    #               '金青鸟是一种外形很漂亮的宠物鸟，价格会因季节而有不同，在春季一只金青鸟价格在250至300元左右，冬季时金青鸟的价格会更高点。金青鸟价格1、开春后大量出小鸟时，在市面上一般250-300元能找到很不错的雏鸟。12月份多是换毛后的出的小鸟，数量比较少，价钱正常是三百或以上。2、好的母种鸟300—500元，公的要500—1000元。买种鸟**是亲自去别人的繁殖场选为好，选的母鸟**是有抱窝经验的，母鸟主要看它的抱窝型、品相也重要。公鸟要选会四喜口，而且各项条件都满意的公鸟。金青鸟的特点1、漂亮的外形金青鸟外形很漂亮，尤其是雄鸟，额头上两道粗粗的黄眉在额头中心相连，俗称通眉；另有一道黄眉生在眼下，像是腮红点缀着脸庞；背上的羽毛像鸭背一层绿一层黄；喉部至胸腹部是鲜艳的金黄色羽毛；尾羽的尾部又是白颜色的，非常好看。小鸟与母鸟的羽毛颜色较暗淡。2、委婉动听的歌喉这种鸟雌雄都可鸣叫，但雄鸟优秀的歌喉使得人听人爱。一般雄鸟都会唱七、八个调，而且可同时发出几种音阶声，令人以为有很多鸟在唱歌，让人越听越舒服。3、个体小、主食简单金青鸟体长12')
    # print(r)
    # pass
    # get_links('黄鹂鸟怎么唱歌')
    # get_article('https://www-360doc-cn.sm-tc.cn/c/www.360doc.cn/mip/2810612.html')
    # r = requests.get('https://avatar.csdn.net/4/6/1/3_u010402518.jpg')
    # Download().save(os.path.split('https://avatar.csdn.net/4/6/1/3_u010402518.jpg')[1], r.content)
